# This script filters the data, converts Affymetrix probe set IDs to gene
# symbols, and checks that both datasets contain the same probe set rows.
# 1) Dataset: GEO-ID: GSE20295
# Array platform: Affymetrix HG-U133A

# Download the data into the current working directory
#ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE20nnn/GSE20295/matrix/GSE20295_series_matrix.txt.gz



# ---- GSE20295: import, keep substantia nigra samples, annotate probes ----
# The series matrix is imported a single time. This section used to be present
# twice with identical logic; the two copies are merged here and the extra
# inspection output of the second copy is kept. Objects created for the rest
# of the script: zhangdatgeo, zhangdat, zhang_tissues, zhang_outcome,
# nigra_ind, zhangfilt, zhang_outcomefilt, conv_ids.

# mapIds() and the hgu133a.db annotation object come from these Bioconductor
# packages; attach them explicitly so the script also runs in a fresh session.
library(AnnotationDbi)
library(hgu133a.db)

zhang_file <- "./GSE20295_series_matrix.txt.gz"

# Read the data into R. Lines starting with "!" are treated as comments, so
# only the tab-separated table with its header row is parsed.
zhangdatgeo <- read.table(
  gzfile(zhang_file),
  header = TRUE, comment.char = "!", sep = "\t"
)

# Use the labels in the first column as row names
zhangdat <- zhangdatgeo[, -1, drop = FALSE]
rownames(zhangdat) <- zhangdatgeo[, 1]
# Show the size only; printing the whole table floods the console
dim(zhangdat)

# Per-sample annotation rows are located by the number of lines skipped before
# the row is read.
# NOTE(review): these offsets are tied to this exact file -- confirm them
# against the row labels printed below if the file is downloaded again.
zhang_tissue_skip <- 39
zhang_outcome_skip <- 41

# Tissue annotation row: the first cell is not a sample value, so it is shown
# for inspection and then dropped.
zhang_tissue_row <- as.matrix(read.table(
  gzfile(zhang_file),
  header = FALSE, nrows = 1, skip = zhang_tissue_skip, sep = "\t"
))
zhang_tissue_row[1]
zhang_tissues <- zhang_tissue_row[-1]
table(zhang_tissues)

# Outcome annotation row, handled the same way
zhang_outcome_row <- as.matrix(read.table(
  gzfile(zhang_file),
  header = FALSE, nrows = 1, skip = zhang_outcome_skip, sep = "\t"
))
zhang_outcome_row[1]
zhang_outcome <- zhang_outcome_row[-1]
table(zhang_outcome)

# The annotation vectors are used to index the expression columns, so they
# must have exactly one entry per sample column.
stopifnot(
  length(zhang_tissues) == ncol(zhangdat),
  length(zhang_outcome) == ncol(zhangdat)
)

# Filter out tissue samples which are not from the midbrain / substantia nigra
# region: select only substantia nigra samples
nigra_ind <- which(zhang_tissues == "Postmortem brain whole substantia nigra")
# An empty selection would silently produce a zero-column data set
stopifnot(length(nigra_ind) > 0)

# drop = FALSE keeps a data frame even if only one sample is selected
zhangfilt <- zhangdat[, nigra_ind, drop = FALSE]
dim(zhangfilt)

zhang_outcomefilt <- zhang_outcome[nigra_ind]
table(zhang_outcomefilt)
zhang_outcomefilt

# convert Affymetrix probe set IDs to gene symbols
conv_ids <- mapIds(
  hgu133a.db,
  keys = rownames(zhangfilt),
  column = "SYMBOL",
  keytype = "PROBEID"
)
head(conv_ids)


#
# 2) Dataset GSE8397
# Download the data into the current working directory

# ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE8nnn/GSE8397/matrix/GSE8397-GPL96_series_matrix.txt.gz


# ---- GSE8397 (GPL96): import and keep substantia nigra samples ----
# Objects created here: morandatgeo, morandat, moran_tissues, moran_outcome,
# nigra_ind, moranfilt, moran_outcomefilt.

moran_file <- "./GSE8397-GPL96_series_matrix.txt.gz"

# Read the data. Lines starting with "!" are treated as comments, so only the
# tab-separated table with its header row is parsed.
morandatgeo <- read.table(
  gzfile(moran_file),
  header = TRUE, comment.char = "!", sep = "\t"
)

# Use the labels in the first column as row names
morandat <- morandatgeo[, -1, drop = FALSE]
rownames(morandat) <- morandatgeo[, 1]

# Per-sample annotation rows are located by the number of lines skipped before
# the row is read.
# NOTE(review): these offsets are tied to this exact file -- confirm them if
# the file is downloaded again.
moran_tissue_skip <- 36
moran_outcome_skip <- 28

# In both rows the first cell is not a sample value, so it is dropped.
moran_tissues <- as.matrix(read.table(
  gzfile(moran_file),
  header = FALSE, nrows = 1, skip = moran_tissue_skip, sep = "\t"
))[-1]

moran_outcome <- as.matrix(read.table(
  gzfile(moran_file),
  header = FALSE, nrows = 1, skip = moran_outcome_skip, sep = "\t"
))[-1]

# The annotation vectors are used to index the expression columns, so they
# must have exactly one entry per sample column.
stopifnot(
  length(moran_tissues) == ncol(morandat),
  length(moran_outcome) == ncol(morandat)
)

# Filter out tissue samples which are not from the midbrain / substantia nigra
# region. Note: this overwrites the nigra_ind computed for GSE20295.
nigra_ind <- grep("substantia nigra", moran_tissues)
# An empty selection would silently produce a zero-column data set
stopifnot(length(nigra_ind) > 0)

# Collapse the free-text outcome descriptions to two labels. The patterns are
# case-sensitive and applied in this order, exactly as before.
moran_outcome[grepl("control", moran_outcome)] <- "control"
moran_outcome[grepl("Parkinson", moran_outcome)] <- "parkinson"

# drop = FALSE keeps a data frame even if only one sample is selected
moranfilt <- morandat[, nigra_ind, drop = FALSE]
dim(moranfilt)

moran_outcomefilt <- moran_outcome[nigra_ind]
table(moran_outcomefilt)

# Check that both filtered data sets list the same row names in the same
# order. identical() also returns FALSE when the numbers of rows differ,
# whereas an element-wise == would recycle the shorter vector.
identical(rownames(zhangfilt), rownames(moranfilt))
